import gc
import pandas as pd
import polars as pl
import numpy as np
import datetime as dt
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import seaborn as sns
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.express as px
import plotly.offline
import warnings
# Ignore every warning raised from here on (no category or module filter is
# given, so this also hides deprecation notices) — presumably to keep the
# notebook output uncluttered.
warnings.filterwarnings('ignore')
# Read the train and test splits from parquet into polars DataFrames
# (train -> `df`, test -> `test`), in that order.
df, test = (
    pl.read_parquet(parquet_path)
    for parquet_path in ('../data/train.parquet', '../data/test.parquet')
)
# Preview the first rows of the training frame.
df.head()
| Duration | Distance | PLong | PLatd | DLong | DLatd | Haversine | Pmonth | Pday | Phour | Pmin | PDweek | Dmonth | Dday | Dhour | Dmin | DDweek | Temp | Precip | Wind | Humid | Solar | Snow | GroundTemp | Dust |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| i64 | i64 | f64 | f64 | f64 | f64 | f64 | i64 | i64 | i64 | i64 | i64 | i64 | i64 | i64 | i64 | i64 | f64 | f64 | f64 | f64 | f64 | f64 | f64 | f64 |
| 28 | 8480 | 37.530167 | 127.007439 | 37.535221 | 127.068398 | 5.404561 | 4 | 14 | 19 | 48 | 5 | 4 | 14 | 20 | 16 | 5 | 9.5 | 0.0 | 2.3 | 76.0 | 0.02 | 0.0 | 9.9 | 13.0 |
| 28 | 4610 | 37.512104 | 127.10778 | 37.531013 | 127.142365 | 3.704593 | 9 | 10 | 20 | 22 | 0 | 9 | 10 | 20 | 52 | 0 | 23.4 | 0.0 | 1.8 | 48.0 | 0.0 | 0.0 | 22.5 | 17.0 |
| 43 | 7470 | 37.557968 | 126.838287 | 37.557461 | 126.861458 | 2.043273 | 3 | 30 | 18 | 15 | 4 | 3 | 30 | 19 | 0 | 4 | 17.4 | 0.0 | 3.1 | 57.0 | 0.52 | 0.0 | 17.2 | 52.0 |
| 7 | 890 | 37.610523 | 127.059799 | 37.615299 | 127.064468 | 0.671691 | 9 | 24 | 20 | 29 | 0 | 9 | 24 | 20 | 37 | 0 | 17.6 | 0.0 | 1.4 | 57.0 | 0.0 | 0.0 | 16.2 | 10.0 |
| 45 | 5180 | 37.653015 | 127.046997 | 37.653015 | 127.046997 | 0.0 | 8 | 15 | 20 | 16 | 2 | 8 | 15 | 21 | 4 | 2 | 28.4 | 0.0 | 1.5 | 69.0 | 0.01 | 0.0 | 26.9 | 0.0 |
# Per-column summary statistics of the training frame: count, null count,
# mean, std, min, quartiles and max.
df.describe()
| describe | Duration | Distance | PLong | PLatd | DLong | DLatd | Haversine | Pmonth | Pday | Phour | Pmin | PDweek | Dmonth | Dday | Dhour | Dmin | DDweek | Temp | Precip | Wind | Humid | Solar | Snow | GroundTemp | Dust |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| str | f64 | f64 | f64 | f64 | f64 | f64 | f64 | f64 | f64 | f64 | f64 | f64 | f64 | f64 | f64 | f64 | f64 | f64 | f64 | f64 | f64 | f64 | f64 | f64 | f64 |
| "count" | 7.680911e6 | 7.680911e6 | 7.680911e6 | 7.680911e6 | 7.680911e6 | 7.680911e6 | 7.680911e6 | 7.680911e6 | 7.680911e6 | 7.680911e6 | 7.680911e6 | 7.680911e6 | 7.680911e6 | 7.680911e6 | 7.680911e6 | 7.680911e6 | 7.680911e6 | 7.680911e6 | 7.680911e6 | 7.680911e6 | 7.680911e6 | 7.680911e6 | 7.680911e6 | 7.680911e6 | 7.680911e6 |
| "null_count" | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| "mean" | 25.801542 | 3713.363587 | 37.547609 | 126.990825 | 37.547692 | 126.990764 | 1.869877 | 7.579236 | 15.765768 | 14.380114 | 29.150036 | 2.94393 | 7.579684 | 15.765365 | 14.428337 | 29.994824 | 2.945863 | 18.56178 | 0.017822 | 1.819315 | 53.978426 | 0.729826 | 0.009986 | 20.721958 | 32.451052 |
| "std" | 25.042927 | 3957.137315 | 0.04441 | 0.082572 | 0.044448 | 0.083235 | 1.995523 | 2.696767 | 8.74875 | 6.248187 | 17.25869 | 1.975944 | 2.696752 | 8.748496 | 6.483618 | 17.30883 | 1.977546 | 9.498508 | 0.372262 | 0.971008 | 17.422591 | 0.917215 | 0.163083 | 12.650722 | 24.071132 |
| "min" | 1.0 | 1.0 | 37.437271 | 126.798599 | 37.437271 | 126.798599 | 0.0 | 1.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 1.0 | 0.0 | 0.0 | 0.0 | -17.8 | 0.0 | 0.0 | 10.0 | 0.0 | 0.0 | -13.6 | 0.0 |
| "25%" | 8.0 | 1230.0 | 37.51424 | 126.920036 | 37.51424 | 126.919395 | 0.675602 | 6.0 | 8.0 | 10.0 | 14.0 | 1.0 | 6.0 | 8.0 | 10.0 | 15.0 | 1.0 | 12.2 | 0.0 | 1.1 | 41.0 | 0.0 | 0.0 | 11.6 | 16.0 |
| "50%" | 16.0 | 2280.0 | 37.54707 | 126.994263 | 37.546547 | 126.994682 | 1.257281 | 8.0 | 16.0 | 16.0 | 29.0 | 3.0 | 8.0 | 16.0 | 16.0 | 30.0 | 3.0 | 20.1 | 0.0 | 1.7 | 53.0 | 0.27 | 0.0 | 21.1 | 27.0 |
| "75%" | 36.0 | 4620.0 | 37.573242 | 127.061897 | 37.573242 | 127.062424 | 2.363081 | 10.0 | 23.0 | 19.0 | 44.0 | 5.0 | 10.0 | 23.0 | 20.0 | 45.0 | 5.0 | 25.5 | 0.0 | 2.4 | 67.0 | 1.26 | 0.0 | 28.5 | 42.0 |
| "max" | 119.0 | 33290.0 | 37.68972 | 127.180267 | 37.68972 | 127.180267 | 28.63448 | 12.0 | 31.0 | 23.0 | 59.0 | 6.0 | 12.0 | 31.0 | 23.0 | 59.0 | 6.0 | 39.4 | 35.0 | 7.4 | 98.0 | 3.52 | 8.8 | 62.2 | 304.0 |
# Same per-column summary statistics for the test frame, for a side-by-side
# comparison with the training statistics.
test.describe()
| describe | Duration | Distance | PLong | PLatd | DLong | DLatd | Haversine | Pmonth | Pday | Phour | Pmin | PDweek | Dmonth | Dday | Dhour | Dmin | DDweek | Temp | Precip | Wind | Humid | Solar | Snow | GroundTemp | Dust |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| str | f64 | f64 | f64 | f64 | f64 | f64 | f64 | f64 | f64 | f64 | f64 | f64 | f64 | f64 | f64 | f64 | f64 | f64 | f64 | f64 | f64 | f64 | f64 | f64 | f64 |
| "count" | 1.920228e6 | 1.920228e6 | 1.920228e6 | 1.920228e6 | 1.920228e6 | 1.920228e6 | 1.920228e6 | 1.920228e6 | 1.920228e6 | 1.920228e6 | 1.920228e6 | 1.920228e6 | 1.920228e6 | 1.920228e6 | 1.920228e6 | 1.920228e6 | 1.920228e6 | 1.920228e6 | 1.920228e6 | 1.920228e6 | 1.920228e6 | 1.920228e6 | 1.920228e6 | 1.920228e6 | 1.920228e6 |
| "null_count" | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| "mean" | 25.808519 | 3714.139242 | 37.547623 | 126.990696 | 37.54771 | 126.990626 | 1.872064 | 7.576826 | 15.772621 | 14.369577 | 29.147249 | 2.94334 | 7.577254 | 15.771827 | 14.417649 | 29.984598 | 2.945401 | 18.562923 | 0.017954 | 1.819109 | 53.964548 | 0.731035 | 0.009802 | 20.730981 | 32.485708 |
| "std" | 25.044932 | 3954.507378 | 0.044429 | 0.082564 | 0.044473 | 0.083247 | 1.994711 | 2.696917 | 8.74506 | 6.253466 | 17.257083 | 1.976537 | 2.696894 | 8.744956 | 6.48799 | 17.312682 | 1.977935 | 9.496344 | 0.379042 | 0.971095 | 17.417572 | 0.918641 | 0.161709 | 12.658165 | 24.078638 |
| "min" | 1.0 | 1.0 | 37.437271 | 126.798599 | 37.437271 | 126.798599 | 0.0 | 1.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 1.0 | 0.0 | 0.0 | 0.0 | -17.8 | 0.0 | 0.0 | 10.0 | 0.0 | 0.0 | -13.6 | 0.0 |
| "25%" | 8.0 | 1230.0 | 37.51424 | 126.919991 | 37.51424 | 126.919395 | 0.676378 | 6.0 | 8.0 | 10.0 | 14.0 | 1.0 | 6.0 | 8.0 | 10.0 | 15.0 | 1.0 | 12.2 | 0.0 | 1.1 | 41.0 | 0.0 | 0.0 | 11.6 | 16.0 |
| "50%" | 16.0 | 2280.0 | 37.546848 | 126.994263 | 37.546547 | 126.994263 | 1.258823 | 8.0 | 16.0 | 16.0 | 29.0 | 3.0 | 8.0 | 16.0 | 16.0 | 30.0 | 3.0 | 20.1 | 0.0 | 1.7 | 53.0 | 0.27 | 0.0 | 21.1 | 27.0 |
| "75%" | 36.0 | 4630.0 | 37.573242 | 127.061768 | 37.573242 | 127.062424 | 2.366374 | 10.0 | 23.0 | 19.0 | 44.0 | 5.0 | 10.0 | 23.0 | 19.0 | 45.0 | 5.0 | 25.5 | 0.0 | 2.4 | 67.0 | 1.26 | 0.0 | 28.6 | 42.0 |
| "max" | 119.0 | 33290.0 | 37.68972 | 127.180267 | 37.68972 | 127.180267 | 24.978334 | 12.0 | 31.0 | 23.0 | 59.0 | 6.0 | 12.0 | 31.0 | 23.0 | 59.0 | 6.0 | 39.4 | 35.0 | 7.4 | 98.0 | 3.52 | 8.8 | 62.2 | 304.0 |
# Compare the Duration distribution of the train and test splits by drawing
# both histograms on the same axes.
#
# With barmode='overlay' the bars of the two traces are drawn on top of each
# other, so each trace is made semi-transparent; with fully opaque bars the
# later (test) trace covers the part of the train bars that lies behind it.
#
# NOTE: the y-axis shows raw counts, and the train split has about four times
# as many rows as the test split (see the describe() counts), so the test bars
# are expected to be lower even when the distributions have the same shape.
hist_train = go.Histogram(x=df['Duration'], name='Train', opacity=0.6)
hist_test = go.Histogram(x=test['Duration'], name='Test', opacity=0.6)

fig = make_subplots()
fig.add_trace(hist_train)
fig.add_trace(hist_test)
fig.update_layout(
    title='Distribution of Duration',
    barmode='overlay',  # draw both traces in the same bins instead of stacking
    xaxis_title_text='Duration',
    yaxis_title_text='Count',
)
fig.show()